package machinelearning;

/*	Please see the license information in the header below. */
import java.util.*;

/**	LancasterStemmer: Implements the Lancaster (Paice/Husk) word stemmer.
 *
 *	<p>
 *	Paice/Husk Stemmer - License Statement.
 *	</p>
 *
 *	<p>
 *	This software was designed and developed at Lancaster
 *	University, Lancaster, UK, under the supervision of Dr Chris Paice.
 *	It is fully in the public domain, and may be used or adapted by any
 *	organisation or individual. Neither Dr Paice nor Lancaster
 *	University accepts any responsibility whatsoever for its use by
 *	other parties, and makes no guarantees, expressed or implied, about
 *	its quality, reliability, or any other characteristic.
 *	</p>
 *
 *	<p>
 *	It is assumed that, as a matter of professional courtesy, anyone
 *	who incorporates this software into a system of their own, whether
 *	for commercial or research purposes, will acknowledge the source of
 *	the code.
 *	</p>
 *
 *	<p>
 *	Modified from the original Java programs written by Christopher O'Neill
 *	and Rob Hooper.
 *	</p>
 */
public class LancasterStemmer
{

    /**	Prefixes to remove from words before stemming.
     */
    public static final String[] prefixes =
    {
        "intra",
        "kilo",
        "mega",
        "micro",
        "milli",
        "nano",
        "pico",
        "pseudo",
        "ultra",
    };
    /**	Default stemming rules.
     *
     *	<p>
     *	These rules MUST be stored in ascending alphanumeric order
     * of the first character.
     *	</p>
     */
    public static final String[] defaultStemmingRules =
            new String[]
    {
        "ai*2.     { -ia > -   if intact }",
        "a*1.      { -a > -    if intact }",
        "bb1.      { -bb > -b   }",
        "city3s.   { -ytic > -ys }",
        "ci2>      { -ic > -    }",
        "cn1t>     { -nc > -nt  }",
        "dd1.      { -dd > -d   }",
        "dei3y>    { -ied > -y  }",
        "deec2ss.  { -ceed > -cess }",
        "dee1.     { -eed > -ee }",
        "de2>      { -ed > -    }",
        "dooh4>    { -hood > -  }",
        "e1>       { -e > -     }",
        "feil1v.   { -lief > -liev }",
        "fi2>      { -if > -    }",
        "gni3>     { -ing > -   }",
        "gai3y.    { -iag > -y  }",
        "ga2>      { -ag > -    }",
        "gg1.      { -gg > -g   }",
        "ht*2.     { -th > -   if intact }",
        "hsiug5ct. { -guish > -ct }",
        "hsi3>     { -ish > -   }",
        "i*1.      { -i > -    if intact }",
        "i1y>      { -i > -y    }",
        "ji1d.     { -ij > -id   --  see nois4j> & vis3j> }",
        "juf1s.    { -fuj > -fus }",
        "ju1d.     { -uj > -ud  }",
        "jo1d.     { -oj > -od  }",
        "jeh1r.    { -hej > -her }",
        "jrev1t.   { -verj > -vert }",
        "jsim2t.   { -misj > -mit }",
        "jn1d.     { -nj > -nd  }",
        "j1s.      { -j > -s    }",
        "lbaifi6.  { -ifiabl > - }",
        "lbai4y.   { -iabl > -y }",
        "lba3>     { -abl > -   }",
        "lbi3.     { -ibl > -   }",
        "lib2l>    { -bil > -bl }",
        "lc1.      { -cl > c    }",
        "lufi4y.   { -iful > -y }",
        "luf3>     { -ful > -   }",
        "lu2.      { -ul > -    }",
        "lai3>     { -ial > -   }",
        "lau3>     { -ual > -   }",
        "la2>      { -al > -    }",
        "ll1.      { -ll > -l   }",
        "mui3.     { -ium > -   }",
        "mu*2.     { -um > -   if intact }",
        "msi3>     { -ism > -   }",
        "mm1.      { -mm > -m   }",
        "nois4j>   { -sion > -j }",
        "noix4ct.  { -xion > -ct }",
        "noi3>     { -ion > -   }",
        "nai3>     { -ian > -   }",
        "na2>      { -an > -    }",
        "nee0.     { protect  -een }",
        "ne2>      { -en > -    }",
        "nn1.      { -nn > -n   }",
        "pihs4>    { -ship > -  }",
        "pp1.      { -pp > -p   }",
        "re2>      { -er > -    }",
        "rae0.     { protect  -ear }",
        "ra2.      { -ar > -    }",
        "ro2>      { -or > -    }",
        "ru2>      { -ur > -    }",
        "rr1.      { -rr > -r   }",
        "rt1>      { -tr > -t   }",
        "rei3y>    { -ier > -y  }",
        "sei3y>    { -ies > -y  }",
        "sis2.     { -sis > -s  }",
        "si2>      { -is > -    }",
        "ssen4>    { -ness > -  }",
        "ss0.      { protect  -ss }",
        "suo3>     { -ous > -   }",
        "su*2.     { -us > -   if intact }",
        "s*1>      { -s > -    if intact }",
        "s0.       { -s > -s    }",
        "tacilp4y. { -plicat > -ply }",
        "ta2>      { -at > -    }",
        "tnem4>    { -ment > -  }",
        "tne3>     { -ent > -   }",
        "tna3>     { -ant > -   }",
        "tpir2b.   { -ript > -rib }",
        "tpro2b.   { -orpt > -orb }",
        "tcud1.    { -duct > -duc }",
        "tpmus2.   { -sumpt > -sum }",
        "tpec2iv.  { -cept > -ceiv }",
        "tulo2v.   { -olut > -olv }",
        "tsis0.    { protect  -sist }",
        "tsi3>     { -ist > -   }",
        "tt1.      { -tt > -t   }",
        "uqi3.     { -iqu > -   } ",
        "ugo1.     { -ogu > -og }",
        "vis3j>    { -siv > -j  }",
        "vie0.     { protect  -eiv }",
        "vi2>      { -iv > -    }",
        "ylb1>     { -bly > -bl }",
        "yli3y>    { -ily > -y  }",
        "ylp0.     { protect  -ply }",
        "yl2>      { -ly > -    }",
        "ygo1.     { -ogy > -og }",
        "yhp1.     { -phy > -ph }",
        "ymo1.     { -omy > -om }",
        "ypo1.     { -opy > -op }",
        "yti3>     { -ity > -   }",
        "yte3>     { -ety > -   }",
        "ytl2.     { -lty > -l  }",
        "yrtsi5.   { -istry > - }",
        "yra3>     { -ary > -   }",
        "yro3>     { -ory > -   }",
        "yfi3.     { -ify > -   }",
        "ycn2t>    { -ncy > -nt }",
        "yca3>     { -acy > -   }",
        "zi2>      { -iz > -    }",
        "zy1s.     { -yz > -ys  }",
        "end0."
    };
    /**	Character for "0" digit.
     */
    protected final static char zeroDigit = '0';

    /*	Array of rules. */
    protected Vector<String> ruleTable;

    /*	Index to rule table.
     *
     *	<p>
     *	For each letter 'a' through 'z', contains the index in
     *	ruleTable for the first rule beginning with the
     *	corresponding letter.  Position 0 is for letter 'a',
     *	position 1 for letter 'b', and so on.  In the default table above,
     *	ruleTableIndex[ 0 ] = 0, ruleTableIndex[ 1 ] = 2, etc.
     *	The index for Letters without a rule are assigned the index
     *	of the next letter which has a rule.
     *	</p>
     */
    protected int[] ruleTableIndex;

    /*	True to remove prefixes when word length is greater than two. */
    protected boolean preStrip;

    /**	Create a Paice/Husk stemmer using the default stemming rules.
     *
     *	@throws		StemmerException if something goes wrong.
     *
     *	<p>
     *	Prefixes are automatically removed from words with more than
     *	two characters.
     *	</p>
     */
    public LancasterStemmer()
    {
        this.preStrip = true;

        loadRules(defaultStemmingRules);
    }

    /**	Create a Paice/Husk stemmer from a string list of rules.
     *
     *	@param	rules	The stemming rules as an array of String.
     *
     *	<p>
     *	Prefixes are automatically removed from words with more than
     *	two characters.
     *	</p>
     */
    public LancasterStemmer(String[] rules)
    {
        this.preStrip = true;

        loadRules(rules);
    }

    /**	Create a Paice/Husk stemmer from a string list of rules.
     *
     *	@param	rules			The stemming rules as an array of String.
     *	@param	preStrip		True to remove prefixes from words with
     *							more than two characters.
     *
     *	<p>
     *	Prefixes are automatically removed from words with more than
     *	two characters.
     *	</p>
     */
    public LancasterStemmer(String[] rules, boolean preStrip)
    {
        this.preStrip = preStrip;

        loadRules(rules);
    }

    /**	Loads the stemming rules.
     *
     *	@param	rules	String array of rules.
     */
    protected void loadRules(String[] rules)
    {
        //	Table of rules.

        ruleTable = new Vector<String>();

        //	Maps letter to index of first rule
        //	in rule table starting with that letter.

        ruleTableIndex = new int[26];

        for (int i = 0; i < 25; i++)
        {
            ruleTableIndex[i] = 0;
        }
        //	Loop over rules and add each
        //	to rule table.

        for (int i = 0; i < rules.length; i++)
        {
            //	Remove blanks from rule and add it
            //	to rule table.

            ruleTable.addElement(rules[i].replaceAll(" ", ""));
        }
        //	Get starting index of rule
        //	for each letter.  Letters without
        //	any rules get the index of the
        //	next letter with a rule.
        char ch = 'a';

        for (int i = 0; i < (rules.length - 1); i++)
        {
            while (((String) ruleTable.elementAt(i)).charAt(0) != ch)
            {
                ch++;
                ruleTableIndex[charCode(ch)] = i;
            }
        }
    }

    /**	Returns index of first vowel in string.
     *
     *	@param	s		String to search for vowel.
     *	@param	last	Last position to search for vowel.
     *
     *	@return			Zero-based index of first vowel in string.
     */
    protected int firstVowel(String s, int last)
    {
        char prevChar = 'a';
        int i;

        for (i = 0;
                (i < last) &&
                (!(vowel(s.charAt(i), prevChar)));
                i++)
        {
            prevChar = s.charAt(i);
        }

        return Math.min(i, last);
    }

    /**	Strip suffixes from a string.
     *
     *	@param	s	The string from which to remove suffixes.
     *
     *	@return		The string with suffixes removed.
     */
    protected String stripSuffixes(String s)
    {
        //	Is the current rule OK.

        int ruleOK = 0;

        //	Are we done stemming a string.

        int done = 0;

        //	Position of last letter in string.

        int lastLetterPos = 0;

        //	Counter for number of characters
        //	to be replaced and length of stemmed
        //	string if rule was applied.

        int replacedCharCount = 0;

        //	Position of first vowel in string.

        int firstVowelPos = 0;

        //	Index into rule table.

        int currentRuleIndex = 0;

        //	Index of current rule.

        int ruleCharPos = 0;

        //	Index of word.

        int wordCharPos = 0;

        //	Last letter in string.

        char lastLetter = 0;

        //	Holds current stemming rule.

        String rule = "";

        //	True if the input string has not yet
        //	been stemmed.

        boolean intact = true;

        //	"stem" contains the stemmed input
        //	string in as the stemming process
        //	proceeds.
        //
        //	Start by cleaning the input string
        //	of non-letters.

        String stem = clean(s.toLowerCase());

        //	Set lastLetterPos to the index of the
        //	last letter in the string.  Normally
        //	we will have removed all non-letters
        //	from the string before we get here,
        //	so usually posLastletter will just be
        //	one less than the length of the string.

        lastLetterPos = 0;

        while (((lastLetterPos + 1) < stem.length()) &&
                isLetter(stem.charAt(lastLetterPos + 1)))
        {
            lastLetterPos++;
        }

        if (lastLetterPos < 1)
        {
            done = -1;
        }
        else
        //	Find position of first vowel in string.
        {
            firstVowelPos = firstVowel(stem, lastLetterPos);
            wordCharPos = stem.length() - 1;
        }
        //	Repeat rule processing until
        //	no more rules apply, i.e.,
        //	stemming is complete.

        while (done != -1)
        {
            //	Look for rule for new final letter.
            done = 0;
            //	Get last letter in string.

            lastLetter = stem.charAt(lastLetterPos);

            //	Are there are any possible rules
            //	for stemming for this letter?

            if (isLetter(lastLetter) &&
                    (lastLetter >= 'a') &&
                    (lastLetter <= 'z'))
            {
                currentRuleIndex = ruleTableIndex[charCode(lastLetter)];
            }
            else
            {
                currentRuleIndex = -1;
            }
            //	No rule available -- stemming done.

            if (currentRuleIndex == -1)
            {
                done = -1;
                continue;
            }
            //	Pick up first pontentially matching
            //	rule.

            rule = (String) ruleTable.elementAt(currentRuleIndex);

            while (done == 0)
            {
                ruleOK = 0;

                if (rule.charAt(0) != lastLetter)
                {
                    //	Rule letter changed.  We're done
                    //	with this letter.

                    done = -1;
                    ruleOK = -1;
                }
                //	Index in rule: second character.

                ruleCharPos = 1;

                //	Index in stemmed string:
                //	next to last letter.

                wordCharPos = lastLetterPos - 1;

                //	Loop over rules and try to find
                //	a rule that is acceptable.

                while (ruleOK == 0)
                {
                    //	Is rule fully matched?

                    if (isDigit(rule.charAt(ruleCharPos)))
                    {
                        ruleOK = 1;
                    }
                    else if (rule.charAt(ruleCharPos) == '*')
                    {
                        //	Match only if word intact.

                        if (intact)
                        {
                            //	Move forwards in rule.

                            ruleCharPos++;
                            ruleOK = 1;
                        }
                        else
                        {
                            ruleOK = -1;
                        }
                    }
                    //	Mismatch of letters.
                    else if (rule.charAt(ruleCharPos) !=
                            stem.charAt(wordCharPos))
                    {
                        ruleOK = -1;
                    }
                    //	Insufficient stem remaining.
                    else if (wordCharPos <= firstVowelPos)
                    {
                        ruleOK = -1;
                    }
                    //	Compare next pair of letters.
                    //	Move forwards in rule and
                    //	backwards in string.
                    else
                    {
                        ruleCharPos++;
                        wordCharPos--;
                    }
                }
                //	If the rule that has just been checked
                //	is valid for the current stem value,
                //	check the acceptability conditions
                //	for the current stem value.

                if (ruleOK == 1)
                {
                    //	Count replacement letters.

                    replacedCharCount = 0;

                    while (!((rule.charAt(
                            ruleCharPos + replacedCharCount + 1) >=
                            '.') &&
                            (rule.charAt(
                            ruleCharPos + replacedCharCount + 1) <=
                            '>')))
                    {
                        replacedCharCount++;
                    }

                    replacedCharCount =
                            lastLetterPos + replacedCharCount + zeroDigit -
                            ((int) (rule.charAt(ruleCharPos)));

                    //	Position of last letter if rule used.

                    if (firstVowelPos == 0)
                    {
                        //	If word starts with vowel...

                        if (replacedCharCount < 1)
                        {
                            //	... minimal stem is 2 letters.

                            ruleOK = -1;
                        }
                    }
                    //	If word starts with a consonant,
                    //	minimal stem is 3 letters
                    //	including one or more vowels.
                    else if ((replacedCharCount < 2) ||
                            (replacedCharCount < firstVowelPos))
                    {
                        ruleOK = -1;
                    }
                }
                //	If using rule passes the assertion
                //	tests, apply the matching rule.

                if (ruleOK == 1)
                {
                    //	Input string is no longer intact.

                    intact = false;

                    //	Move end of string marker to position
                    //	given by the numeral in the rule.

                    lastLetterPos =
                            lastLetterPos + zeroDigit -
                            ((int) (rule.charAt(ruleCharPos)));

                    ruleCharPos++;

                    stem = stem.substring(0, (lastLetterPos + 1));

                    //	Append any letters following numeral
                    //	to the string.
                    while ((ruleCharPos < rule.length()) &&
                            isLetter(rule.charAt(ruleCharPos)))
                    {
                        stem += rule.charAt(ruleCharPos);

                        ruleCharPos++;
                        lastLetterPos++;
                    }
                    //	Rule ends with '.'.  We're done.

                    if ((rule.charAt(ruleCharPos)) == '.')
                    {
                        done = -1;
                    }
                    else
                    {
                        //	Here if rule ends with '>'.  Continue.

                        done = 1;
                    }
                }
                else
                {
                    //	Rule did not match.
                    //	Try next rule in rule table.

                    currentRuleIndex++;

                    rule =
                            (String) ruleTable.elementAt(currentRuleIndex);

                    //	When the initial letter changes,
                    //	there are no more rules to try.

                    if (rule.charAt(0) != lastLetter)
                    {
                        done = -1;
                    }
                }
            }
        }

        return stem;
    }

    /**	Is character an English vowel?
     *
     *	@param	c	Character to check.
     *
     *	@return		true if "c" is a, e, i, o, u.
     */
    public boolean isEnglishVowel(char c)
    {
        return (c == 'a') ||
                (c == 'e') ||
                (c == 'i') ||
                (c == 'o') ||
                (c == 'u');
    }

    /**	Determine if character is a vowel or not.
     *
     *	@param	ch		The potential vowel.
     *	@param	prev	The previous character.
     *
     *	@return			true if the character is a vowel.
     *
     *	<p>
     *	When the character is a "y", the previous character is
     *	checked to see if it is a vowel.  If so, "y" is not considered
     *	a vowel.
     *	</p>
     */
    protected boolean vowel(char ch, char prev)
    {
        boolean result = isEnglishVowel(ch);

        if (!result && (ch == 'y'))
        {
            result = !isEnglishVowel(prev);
        }

        return result;
    }

    /**	Determine if character is a digit.
     *
     *	@param	ch		The character to check.
     *
     *	@return			true if "ch" is a digit ('0' .. '9').
     */
    protected boolean isDigit(char ch)
    {
        return Character.isDigit(ch);
    }

    /**	Determine if character is a letter.
     *
     *	@param	ch		The character to check.
     *
     *	@return			true if "ch" is a letter ('a' .. 'z').
     */
    protected boolean isLetter(char ch)
    {
        return Character.isLetter(ch);
    }

    /**	Converts a lower case letter to an index.
     *
     *	@param	ch	The character.  Must be in the range 'a' .. 'z'.
     *
     *	@return		The index, where 'a' = 0 .
     */
    protected int charCode(char ch)
    {
        return ((int) ch) - 'a';
    }

    /**	Removes prefixes from a string.
     *
     *	@param	s		The string from which to remove prefixes.
     *
     *	@return			The string with prefixes removed.
     */
    protected String stripPrefixes(String s)
    {
        String result = s;
        String sLower = s.toLowerCase();

        //	Remove any prefix from string
        //	as long as the string is longer
        //	than the prefix.

        for (int i = 0; i < prefixes.length; i++)
        {
            if ((sLower.startsWith(prefixes[i])) &&
                    (sLower.length() > prefixes[i].length()))
            {
                result = s.substring(prefixes[i].length());
                break;
            }
        }

        return result;
    }

    /**	Remove non-letters from a string.
     *
     *	@param	s	String from which to remove non-letters.
     *
     *	@return		String with non-letters removed.
     */
    protected String clean(String s)
    {
        StringBuffer result = new StringBuffer();

        for (int i = 0; i < s.length(); i++)
        {
            if (isLetter(s.charAt(i)))
            {
                result.append(s.charAt(i));
            }
        }

        return result.toString();
    }

    /**	Stem a specified string.
     *
     *	@param	s	The string to stem.
     *
     *	@return		The stemmed string.
     */
    public String stem(String s)
    {
        //	Copy input string to be stemmed.

        String result = s;

        //	Remove prefixes if the input string
        //	is longer than three characters and
        //	prefix stripping was requested.

        if ((result.length() > 3) && preStrip)
        {
            result = stripPrefixes(result);
        }
        //	Remove suffixes if the string
        //	is longer than three characters.

        if (result.length() > 3)
        {
            result = stripSuffixes(result);
        }

        return result;
    }
}

